Load the necessary libraries
library(car) #for regression diagnostics
library(broom) #for tidy output
library(ggfortify) #for model diagnostics
library(sjPlot) #for outputs
library(knitr) #for kable
library(effects) #for partial effects plots
library(emmeans) #for estimating marginal means
library(ggeffects) #for partial effects plots
library(modelr) #for auxillary modelling functions
library(DHARMa) #for residual diagnostics plots
library(performance) #for residuals diagnostics
library(see) #for plotting residuals
library(tidyverse) #for data wrangling
Polis et al. (1998) were interested in modelling the presence/absence of lizards (Uta sp.) against the perimeter to area ratio of 19 islands in the Gulf of California.
Uta lizard
Format of polis.csv data file
| ISLAND | RATIO | PA |
|---|---|---|
| Bota | 15.41 | 1 |
| Cabeza | 5.63 | 1 |
| Cerraja | 25.92 | 1 |
| Coronadito | 15.17 | 0 |
| .. | .. | .. |
| ISLAND | Categorical listing of the name of the 19 islands used - variable not used in analysis. |
| RATIO | Ratio of perimeter to area of the island. |
| PA | Presence (1) or absence (0) of Uta lizards on island. |
The aim of the analysis is to investigate the relationship between island perimeter to area ratio and the presence/absence of Uta lizards.
## Import the polis data set (whitespace around values is trimmed on read)
polis <- read_csv('../public/data/polis.csv', trim_ws = TRUE)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## ISLAND = col_character(),
## RATIO = col_double(),
## PA = col_double()
## )
## Quick structural overview: row/column counts, types and first few values
polis %>% glimpse()
## Rows: 19
## Columns: 3
## $ ISLAND <chr> "Bota", "Cabeza", "Cerraja", "Coronadito", "Flecha", "Gemelose…
## $ RATIO <dbl> 15.41, 5.63, 25.92, 15.17, 13.04, 18.85, 30.95, 22.87, 12.01, …
## $ PA <dbl> 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1
## First six rows, then the full internal structure of the tibble
polis %>% head()
polis %>% str()
## tibble [19 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ISLAND: chr [1:19] "Bota" "Cabeza" "Cerraja" "Coronadito" ...
## $ RATIO : num [1:19] 15.41 5.63 25.92 15.17 13.04 ...
## $ PA : num [1:19] 1 1 1 0 1 0 0 0 0 1 ...
## - attr(*, "spec")=
## .. cols(
## .. ISLAND = col_character(),
## .. RATIO = col_double(),
## .. PA = col_double()
## .. )
Model formula: \[ y_i \sim{} \mathcal{Bin}(n, p_i)\\ ln\left(\frac{p_i}{1-p_i}\right) = \beta_0 + \beta_1 x_i \]
where \(y_i\) represents the \(i\) observed values, \(n\) represents the number of trials (in the case of logistic, this is always 1), \(p_i\) represents the probability of lizards being present in the \(i^{th}\) population, and \(\beta_0\) and \(\beta_1\) represent the intercept and slope respectively.
## Raw scatterplot of presence/absence against perimeter:area ratio
ggplot(data = polis, mapping = aes(y = PA, x = RATIO)) +
  geom_point()
## Same plot with a binomial GLM smoother to visualise the logistic trend
ggplot(data = polis, mapping = aes(y = PA, x = RATIO)) +
  geom_point() +
  geom_smooth(method = 'glm', formula = y ~ x,
              method.args = list(family = 'binomial'))
## Logistic regression: each row is a single Bernoulli (0/1) trial
polis.glm <- glm(PA ~ RATIO, data = polis, family = binomial(link = 'logit'))
## Equivalent fit expressed with an explicit number of trials per row (weights)
polis$Total <- 1
polis.glm1 <- glm(PA ~ RATIO, data = polis,
                  family = binomial(link = 'logit'), weights = Total)
## Equivalent fit expressed as a two-column (successes, failures) response
polis$Total <- 1
polis.glm1 <- glm(cbind(PA, Total - PA) ~ RATIO, data = polis,
                  family = binomial(link = 'logit'))
## Standard regression diagnostics (residuals, leverage, Cook's D, ...)
polis.glm %>% autoplot(which = 1:6, label.repel = TRUE)
## there does seem to be an outlier that is influential - obs #3
## Perhaps we should redo the scatterplot but with text so that we can see which obs is #3
Conclusions:
## Label each point with its row number so the influential obs (#3) can be
## identified (seq_len() is the safe idiom; 1:nrow(.) misbehaves on 0 rows)
polis %>% mutate(n=seq_len(nrow(.))) %>%
  ggplot(aes(y=PA, x=RATIO))+geom_text(aes(label=n))
## it seems that Uta lizards were present on this island despite it having
## a relatively large perimeter to area ratio.
## Perhaps this island:
## - was a long way from other islands (isolated)
## - not inhabited
## - was very large
## - some other reason why it was not typical of the population.
## If not, then we cannot simply exclude the point.
## In any case, if anything, this obs is going to result in greater variability and
## a more conservative test - so it is not really harming to leave it in.
## Tabulate dfbetas, dffits, covratio, Cook's D and hat values per observation
polis.glm %>% influence.measures()
## Influence measures of
## glm(formula = PA ~ RATIO, family = binomial(link = "logit"), data = polis) :
##
## dfb.1_ dfb.RATI dffit cov.r cook.d hat inf
## 1 0.182077 -0.007083 0.447814 1.043 5.50e-02 0.109124
## 2 0.167005 -0.141263 0.169959 1.235 6.62e-03 0.111730
## 3 -0.723849 1.079157 1.278634 0.537 8.43e-01 0.151047 *
## 4 -0.239967 0.028419 -0.546081 0.953 9.01e-02 0.108681
## 5 0.248270 -0.126175 0.359999 1.117 3.30e-02 0.110025
## 6 0.028088 -0.196986 -0.437403 1.110 5.00e-02 0.129177
## 7 0.077131 -0.102575 -0.111591 1.250 2.81e-03 0.108288
## 8 0.140334 -0.247315 -0.332565 1.242 2.65e-02 0.155414
## 9 -0.562402 0.338850 -0.723598 0.805 1.89e-01 0.112842
## 10 0.257651 -0.162838 0.319655 1.157 2.52e-02 0.114067
## 11 0.176591 -0.147771 0.180516 1.234 7.49e-03 0.113765
## 12 0.104228 -0.093408 0.104419 1.225 2.46e-03 0.090774
## 13 0.135395 -0.118138 0.136380 1.233 4.23e-03 0.102909
## 14 0.000410 -0.000476 -0.000481 1.131 5.14e-08 0.001445
## 15 0.000218 -0.000251 -0.000254 1.130 1.43e-08 0.000817
## 16 0.139447 -0.248090 -0.335881 1.239 2.70e-02 0.155114
## 17 0.143708 -0.240774 -0.311977 1.255 2.31e-02 0.156543
## 18 0.074831 -0.068694 0.074832 1.211 1.26e-03 0.075520
## 19 0.108633 -0.097001 0.108890 1.226 2.68e-03 0.092718
## Composite diagnostic panel from the performance package
polis.glm %>% performance::check_model()
## Not enough model terms in the conditional part of the model to check for multicollinearity.
## Outlier screen (probability-of-exceedance based)
polis.glm %>% performance::check_outliers()
## Warning: 1 outliers detected (cases 3).
polis.glm %>% performance::check_outliers() %>% plot()
## These are probabilities of exceedance rather than actual Cook's D values
#https://easystats.github.io/performance/reference/check_outliers.html
polis.glm %>% performance::check_heteroscedasticity()
## OK: Error variance appears to be homoscedastic (p = 0.094).
Conclusions
## DHARMa simulated (quantile) residuals, plotted as they are generated
polis.resid <- polis.glm %>% simulateResiduals(plot = TRUE)
Conclusions:
For simple models, we can explore a lack of fit via a goodness of fit test. Here we will do this via:
As the model was fit using maximum likelihood, it is not optimizing the sum square of residuals (like it is in OLS), therefore, it is arguably not really appropriate to be judging the performance of the model based on a property that it is not directly controlling. Deviance on the other hand is directly calculated from the likelihood and thus a fairer basis for gauging the goodness of the model’s fit.
## Check the model for lack of fit via:
## Pearson chi-square goodness-of-fit statistic
polis.ss <- sum(resid(polis.glm, type="pearson")^2)
## Upper-tail probability; df.residual() avoids relying on `$df.resid`
## partial matching, and lower.tail=FALSE is more accurate than 1-pchisq()
pchisq(polis.ss, df.residual(polis.glm), lower.tail=FALSE)
## [1] 0.5715331
#No evidence of a lack of fit
#Deviance-based goodness-of-fit statistic
pchisq(polis.glm$deviance, df.residual(polis.glm), lower.tail=FALSE)
## [1] 0.6514215
#No evidence of a lack of fit
Conclusions:
## Residuals against fitted values using broom's augmented model frame
polis.glm %>%
  augment() %>%
  ggplot(aes(y = .resid, x = .fitted)) +
  geom_point()
## Partial (marginal) effect of RATIO, four equivalent package routes:
## sjPlot version, with the raw data overlaid
plot_model(polis.glm, type='eff', show.data=TRUE)
## $RATIO
## effects version, back-transformed onto the response (probability) scale
plot(allEffects(polis.glm, residuals=TRUE), type='response')
## ggeffects versions (ggpredict / ggemmeans differ in how covariates are held)
ggpredict(polis.glm) %>% plot(add.data=TRUE)
## $RATIO
ggemmeans(polis.glm, ~RATIO) %>% plot(add.data=TRUE)
## Coefficient table, deviances and AIC for the fitted model
polis.glm %>% summary()
##
## Call:
## glm(formula = PA ~ RATIO, family = binomial(link = "logit"),
## data = polis)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6067 -0.6382 0.2368 0.4332 2.0986
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.6061 1.6953 2.127 0.0334 *
## RATIO -0.2196 0.1005 -2.184 0.0289 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26.287 on 18 degrees of freedom
## Residual deviance: 14.221 on 17 degrees of freedom
## AIC: 18.221
##
## Number of Fisher Scoring iterations: 6
Conclusions:
Because the model is fit on the link (log-odds) scale, the coefficients can be back-transformed with exp; log-laws dictate that the resulting odds-ratio effects are applied multiplicatively.
## on link scale (log odds)
## Profile-likelihood confidence intervals on the link (log-odds) scale
polis.glm %>% confint()
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 1.0059470 8.04214480
## RATIO -0.4849431 -0.06648957
## or on odds (ratio) scale
exp(confint(polis.glm))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 2.7344957 3109.2748308
## RATIO 0.6157322 0.9356727
## Tidy coefficient table and one-row model summary via broom
polis.glm %>% tidy(conf.int=TRUE)
polis.glm %>% glance()
tidy(polis.glm, conf.int=TRUE) %>% kable()
| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 3.6060693 | 1.6952580 | 2.127151 | 0.0334076 | 1.0059470 | 8.0421448 |
| RATIO | -0.2195582 | 0.1005191 | -2.184245 | 0.0289443 | -0.4849431 | -0.0664896 |
# warning this is only appropriate for html output
## Publication-style table of odds ratios, SEs, CIs and AIC
sjPlot::tab_model(polis.glm, show.se = TRUE, show.aic = TRUE)
| PA | ||||
|---|---|---|---|---|
| Predictors | Odds Ratios | std. Error | CI | p |
| (Intercept) | 36.82 | 1.70 | 2.73 – 3109.27 | 0.033 |
| RATIO | 0.80 | 0.10 | 0.62 – 0.94 | 0.029 |
| Observations | 19 | |||
| R2 Tjur | 0.521 | |||
| AIC | 18.221 | |||
In simple regression, the \(R^{2}\) value (coefficient of determination) is interpreted as the amount of variation in the response that can be explained by its relationship with the predictor(s) and is calculated as the sum of squares explained divided by the sum of squares total.
In Generalized Linear Models, we can calculate an equivalent property based on deviance (a measure of the total amount unexplained) as:
\[
1-\frac{Deviance}{Deviance_{NULL}}
\] where \(Deviance_{NULL}\) is the deviance of a null model (e.g. glm(PA ~ 1, data=polis, family='binomial'))
#R2 (proportion of null deviance explained)
## Spell out `null.deviance` in full - `polis.glm$null` only works via
## `$` partial matching, which breaks silently if another `null*` element exists
1-(polis.glm$deviance/polis.glm$null.deviance)
## [1] 0.4590197
Conclusions:
In some disciplines it is useful to be able to calculate an LD50. This is the value along the x-axis that corresponds to a probability of 50% - e.g. the switch-over point in Island perimeter to area Ratio at which the lizards go from more likely to be present to more likely to be absent. It is the inflection point.
It is also the point at which the slope (when back-transformed) is at its steepest and can be calculated as:
\[ -\frac{Intercept}{Slope} \]
#LD50: the RATIO at which presence and absence are equally likely (-b0/b1)
## coef() is the supported accessor; `$coef` only works via partial matching
(ld50 <- -coef(polis.glm)[1]/coef(polis.glm)[2])
## (Intercept)
## 16.4242
## What about other points (not just 50%) along with confidence intervals..
ld <- MASS::dose.p(polis.glm, p=c(0.5,0.9))
ld.SE <- attr(ld, "SE")
## Assemble dose estimates with Wald 95% confidence limits
ld <- data.frame(LD = attr(ld,'p'),
                 Dose = as.vector(ld),
                 SE = ld.SE) %>%
  mutate(lower=Dose-SE*qnorm(0.975),
         upper=Dose+SE*qnorm(0.975))
ld
## Using emmeans
## Prediction grid: 100 evenly spaced RATIO values spanning the observed range
polis.grid = with(polis, list(RATIO = seq(min(RATIO), max(RATIO), len=100)))
## NOTE(review): the next line immediately overwrites the grid above with an
## equivalent modelr::data_grid() construction - only one of the two is needed
polis.grid = polis %>% data_grid(RATIO=seq_range(RATIO, n=100))
## Estimated marginal means on the response (probability) scale over the grid
newdata=emmeans(polis.glm, ~RATIO,at=polis.grid, type='response') %>%
as.data.frame
## Fitted logistic curve with a 95% confidence ribbon
ggplot(newdata, aes(y=prob, x=RATIO))+
geom_ribbon(aes(ymin=asymp.LCL, ymax=asymp.UCL), fill='blue',alpha=0.2)+
geom_line() +
theme_classic()
We could also include the raw data
## Fitted curve and CI ribbon, this time with the raw 0/1 data overlaid
ggplot(newdata, aes(y = prob, x = RATIO)) +
  geom_ribbon(aes(ymin = asymp.LCL, ymax = asymp.UCL),
              fill = 'blue', alpha = 0.2) +
  geom_line() +
  geom_point(data = polis, aes(y = PA, x = RATIO)) +
  theme_classic()
Perhaps mark the boundaries of the confidence intervals with dashed lines rather than a ribbon.
## Same plot, but the confidence bounds drawn as dashed lines, not a ribbon
ggplot(newdata, aes(y = prob, x = RATIO)) +
  geom_line(aes(y = asymp.LCL), linetype = 'dashed') +
  geom_line(aes(y = asymp.UCL), linetype = 'dashed') +
  geom_line() +
  geom_point(data = polis, aes(y = PA, x = RATIO)) +
  theme_classic()
For more complex models that include additional covariates, the partial plot represents the expected trend between the response and focal predictor holding all other predictors constant. This is akin to plotting the trend between the response and focal predictor after standardizing first for the other covariate(s).
When this is the case, it is not always appropriate to overlay a partial plot with raw data (as this data has not been standardized and will not necessarily represent the partial effects). Arguably a more appropriate way to represent the data points is to calculate a standardized version of the points by adding the residuals onto the fitted values associated with each of the observed values of the focal predictor. When there is only a single predictor, as is the case here, the result will be identical to just overlaying the raw data.
# Partial will represent the fitted values plus the residuals back transformed onto the probability scale
# Partial1 then backtransforms these onto the response scale [0,1]
## Fit is on the link (log-odds) scale; adding Pearson residuals gives a
## partial residual, and plogis() maps it back to a probability.
## qbinom(p, 1, 0.5) effectively thresholds the probability at 0.5,
## snapping each partial residual to 0 or 1 for plotting on the data scale.
partial.obs = polis %>%
mutate(Fit=predict(polis.glm, newdata=polis, type='link'),
Partial = Fit + resid(polis.glm, type='pearson'),
Partial = plogis(Partial),
Partial1 = qbinom(Partial, 1, 0.5))
## Fitted curve + CI ribbon, raw data (black), partial points (green),
## and the LD50 marked with a vertical dashed line
ggplot(newdata, aes(y=prob, x=RATIO))+geom_line() +
geom_ribbon(aes(ymin=asymp.LCL, ymax=asymp.UCL), fill='blue',alpha=0.2)+
geom_point(data=polis, aes(y=PA, x=RATIO))+
geom_point(data=partial.obs, aes(y=Partial1), color='green') +
geom_vline(xintercept=ld50, linetype='dashed') +
theme_classic()
## modelr route to the same partial-residual plot.
## add_predictions()/add_residuals() work on the link (log-odds) scale, so the
## partials (pred + resid) must also be back-transformed with plogis() before
## being plotted on the probability axis - the original left them on the link
## scale, which put them on the wrong scale relative to `pred`.
newdata = polis %>%
modelr::add_predictions(polis.glm) %>%
modelr::add_residuals(polis.glm) %>%
mutate(Partial=plogis(pred + resid),
pred = plogis(pred))
ggplot(newdata, aes(y=pred, x=RATIO))+geom_line() +
#geom_ribbon(aes(ymin=asymp.LCL, ymax=asymp.UCL), fill='blue',alpha=0.2)+
geom_point(data=polis, aes(y=PA, x=RATIO))+
geom_point(aes(y=Partial), color='green') +
geom_vline(xintercept=ld50, linetype='dashed') +
theme_classic()
## Fit the same model via modelr::fit_with (returns a list of fitted models)
polis.mod = polis %>% fit_with(glm, list(PA~RATIO), family=binomial())
## NOTE(review): modelr::rsquare() computes an OLS-style R2 from squared
## residuals, which is not meaningful for a binomial GLM - hence the
## negative value below; prefer the deviance-based pseudo-R2 shown earlier.
polis.mod[[1]] %>% modelr::rsquare(polis)
## [1] -46.68859
## Same statistic computed over every model in the list via purrr::map
map(polis.mod, ~rsquare(., polis))
## [[1]]
## [1] -46.68859
## Attach response-scale predictions to the data frame
modelr::add_predictions(polis, polis.mod[[1]], type='response')
Polis, G. A., S. D. Hurd, C. D. Jackson, and F. Sanchez-Piñero. 1998. “Multifactor Population Limitation: Variable Spatial and Temporal Control of Spiders on Gulf of California Islands.” Ecology 79: 490–502.